In [1]:
%reload_ext autoreload
%autoreload 2

from torch_snippets import *  # NOTE(review): wildcard import — supplies np, torch, Image, transforms, read, resize, reset_logger used below
from torch_snippets.markup2 import AD  # AD: attribute-dict, used for the config object in the next cell
reset_logger()
2023-12-18 14:27:23.639068: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-18 14:27:23.700936: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-18 14:27:24.637388: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
# 0. load config
# Single source of truth for every path and hyper-parameter used below.
# checkpoint_file is derived from checkpoint_folder so the two can never drift apart
# (the original repeated the full folder path inside checkpoint_file).
checkpoint_folder = "outputs/train_stage_2_v1-2023-12-17T18-31-50/"
config = AD(
    pretrained_model_path="checkpoints/stable-diffusion-v1-5/",
    checkpoint_folder=checkpoint_folder,
    checkpoint_file=f"{checkpoint_folder}checkpoints/checkpoint-epoch-88.ckpt",
    clip_model_path="checkpoints/clip-vit-base-patch32/",
    # DDIM noise-schedule settings passed verbatim to DDIMScheduler(**noise_kwargs)
    noise_kwargs=AD(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        steps_offset=1,
        clip_sample=False,
    ),
    num_inference_steps=25,
    image=AD(
        # NOTE(review): absolute local path — consider a DATA_DIR constant for portability
        source_image="/home/ubuntu/data/animate-anyone/TikTok_dataset/00001/images/0001.png",
        size=256,  # source image and pose frames are resized to size x size
    ),
    video=AD(
        video_path="/home/ubuntu/data/animate-anyone/TikTok_dataset/00003/00003_dwpose.mp4",
        max_length=24,  # cap on number of pose frames animated
        offset=1,       # index of the first pose frame to use
    ),
)
In [3]:
# 1. load models
from pipelines.pipeline_stage_2 import AnimationAnyonePipeline, DDIMScheduler
from utils.load_models import load_models_stage_2
torch.set_grad_enabled(False) # no need for grad computations

# Expensive step (~35 s wall time per the cell output) — timed so readers know the cost
%time models = load_models_stage_2(config)

# Assemble the inference pipeline from the individually loaded sub-models
aapipe = AnimationAnyonePipeline(
    vae=models.vae,
    text_encoder=models.text_encoder,
    tokenizer=models.tokenizer,
    unet=models.unet,
    referencenet=models.referencenet,
    scheduler=DDIMScheduler(**config.noise_kwargs),
)
_ = aapipe.to('cuda')
loaded PoseGuider's pretrained weights from outputs/train_stage_2_v1-2023-12-17T18-31-50/checkpoints/checkpoint-epoch-88.ckpt ...
### missing keys: 0; 
### unexpected keys: 0;
### PoseGuider's Parameters: 0.164792 M
CPU times: user 2min, sys: 14.2 s, total: 2min 14s
Wall time: 35.2 s
In [4]:
# 2. Load video and image
from utils.videoreader import VideoReader

size = config.image.size
max_length = config.video.max_length
offset = config.video.offset  # starting frame index into the pose video
# (bug fix: the original re-assigned `offset = 1` further down, silently
#  overriding config.video.offset and making the config knob dead)

source_image = config.image.source_image
source_image = resize(read(source_image, 1), size)

video_path = config.video.video_path
control = VideoReader(video_path).read()

# Resize pose frames to (size, size) only when they don't already match
if control[0].shape[0] != size:
    control = [np.array(Image.fromarray(c).resize((size, size))) for c in control]

# Keep at most `max_length` frames starting at `offset`
if max_length is not None:
    control = control[offset:(offset + max_length)]
control = np.array(control)

# Rich tensor reprs as the cell output (shapes/ranges shown below)
torch.Tensor(source_image), torch.Tensor(control)
Out[4]:
(tensor[256, 256, 3] n=196608 (0.8Mb) x∈[0., 255.000] μ=117.427 σ=71.879,
 tensor[24, 256, 256, 3] n=4718592 (18Mb) x∈[0., 255.000] μ=3.466 σ=21.109)
In [5]:
# 2. create empty latents and timesteps

dtype = aapipe.unet.dtype
device = aapipe.unet.device
# Seeded generator so the initial noise latents are reproducible within this kernel
generator = torch.Generator(device=aapipe.unet.device)
generator.manual_seed(torch.initial_seed())

noisy_latents = aapipe.prepare_latents(
    batch_size=1,
    num_channels_latents=4,  # SD VAE latent channels
    video_length=24,         # NOTE(review): hardcoded — presumably should track config.video.max_length; confirm
    height=size,
    width=size,
    dtype=dtype,
    device=device,
    generator=generator,
    latents=None,            # None -> fresh noise is sampled from `generator`
    clip_length=16
)
extra_step_kwargs = aapipe.prepare_extra_step_kwargs(generator, eta=0.0)  # eta=0.0 -> deterministic DDIM steps

aapipe.scheduler.set_timesteps(config.num_inference_steps, device=device)
timesteps = aapipe.scheduler.timesteps
noisy_latents, timesteps
Out[5]:
(tensor[1, 4, 32, 32] n=4096 (16Kb) x∈[-3.709, 3.347] μ=0.001 σ=1.006 cuda:0,
 tensor[25] i64 x∈[1, 961] μ=481.000 σ=294.392 cuda:0)
In [6]:
# 3. Setup
# Writer hooks the referencenet to capture its attention features; reader hooks the
# unet to consume them (wired together in a later cell via reader.update(writer)).
from models.ReferenceNet_attention import ReferenceNetAttention
reference_control_writer = ReferenceNetAttention(aapipe.referencenet, do_classifier_free_guidance=False, mode='write', fusion_blocks='full', is_image=False)
reference_control_reader = ReferenceNetAttention(aapipe.unet, do_classifier_free_guidance=False, mode='read', fusion_blocks='full', is_image=False)
In [7]:
# 4. Make source image vae-latents for referencenet and clip-embeddings for referencenet and unet
# [None,:] adds a batch dim before VAE encoding
source_image_latents = aapipe.images2latents(source_image[None,:], dtype=dtype)
# CLIP preprocessing expects a PIL RGB image; processor returns pixel_values ready for the encoder
source_image_clip = models.clip_image_processor(images=Image.fromarray(source_image).convert('RGB'), return_tensors="pt").pixel_values.to(device=device)
# unsqueeze(1) adds a sequence dim so the embedding can serve as encoder_hidden_states
source_image_clip_embeddings = models.clip_image_encoder(source_image_clip).unsqueeze(1).to(device=device, dtype=dtype)
source_image_latents, source_image_clip_embeddings
Out[7]:
(tensor[1, 4, 32, 32] n=4096 (16Kb) x∈[-4.270, 4.173] μ=0.077 σ=0.912 cuda:0,
 tensor[1, 1, 768] 3Kb x∈[-5.856, 5.765] μ=0.204 σ=0.935 cuda:0)
In [8]:
# 5. Make pose latents using poseguider for unet
#### pose condition ####
# Map [0, 255] pixel values to [-1, 1] (x/255 then (x - 0.5) / 0.5)
normalize_pixels = transforms.Compose([
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
])

# (T, H, W, C) uint8 frames -> (T, C, H, W) float tensor on the unet's device/dtype
pose_frames = torch.from_numpy(control.copy()).to(device=device, dtype=dtype)
pose_condition = normalize_pixels(pose_frames.permute(0, 3, 1, 2) / 255.0)
pose_latents = models.poseguider(pose_condition)
pose_latents
Out[8]:
tensor[24, 4, 32, 32] n=98304 (0.4Mb) x∈[-0.033, 0.078] μ=-0.001 σ=0.010 cuda:0
In [9]:
# Clear any features captured on earlier runs, then do one referencenet forward pass
# on the source-image latents so the writer hooks capture its attention features;
# finally copy them into the unet's reader hooks. Order matters: clear -> write -> update.
reference_control_reader.clear()
reference_control_writer.clear()
aapipe.referencenet(source_image_latents, timesteps[0], source_image_clip_embeddings)
reference_control_reader.update(reference_control_writer)
In [10]:
# Prepare unet input for a SINGLE denoising step at the first (largest) timestep.
# NOTE(review): a full sampling run would loop over all `timesteps`; this notebook
# appears to probe one step only. Pose latents are added directly to the scaled noise.
t = timesteps[0]
unet_input_latents = aapipe.scheduler.scale_model_input(noisy_latents, t) + pose_latents
unet_input_latents
Out[10]:
tensor[24, 4, 32, 32] n=98304 (0.4Mb) x∈[-3.725, 3.360] μ=0.000 σ=1.005 cuda:0
In [11]:
# [None] adds a batch dim, permute swaps to (B, C, T, H, W) for the video unet;
# after the forward pass, [0] drops the batch dim and permute(1,0,2,3) restores (T, C, H, W)
unet_noise_pred = aapipe.unet(unet_input_latents[None].permute(0, 2, 1, 3, 4), t, encoder_hidden_states=source_image_clip_embeddings).sample[0].permute(1,0,2,3)
unet_noise_pred
Out[11]:
tensor[24, 4, 32, 32] n=98304 (0.4Mb) x∈[-3.612, 3.080] μ=5.489e-05 σ=0.935 cuda:0
In [12]:
unet_output_latents = aapipe.scheduler.step(unet_noise_pred, t, unet_input_latents, **extra_step_kwargs, return_dict=False)[0]
unet_output_latents
Out[12]:
tensor[24, 4, 32, 32] n=98304 (0.4Mb) x∈[-3.753, 3.449] μ=0.000 σ=1.024 cuda:0
In [14]:
# Decode latents back to pixel space via the VAE; output is (T, H, W, C) in [0, 1]
# per the repr below. NOTE(review): execution counts are out of order from here on
# (In[14], In[20], In[18], In[15]) — re-run top-to-bottom before sharing.
video = aapipe.decode_latents(unet_output_latents.detach(), rank=0)
torch.Tensor(video)
Out[14]:
tensor[24, 256, 256, 3] n=4718592 (18Mb) x∈[0., 1.000] μ=0.453 σ=0.341
In [20]:
unet_output_latents.mean((1,2,3)).v
Out[20]:
tensor[24] x∈[-0.000, 0.000] μ=0.000 σ=0.000 cuda:0
tensor([-1.1682e-04, -5.0379e-05,  4.3799e-05,  1.2841e-04,  2.4124e-04,
         3.2459e-04,  3.3582e-04,  4.0947e-04,  3.6748e-04,  2.7905e-04,
         1.8099e-04,  1.7641e-04,  2.6538e-04,  3.1102e-04,  3.4970e-04,
         2.8526e-04,  2.1033e-04,  1.5050e-04,  8.8955e-05,  4.6823e-05,
         7.1743e-05,  8.3228e-05,  6.7699e-05,  7.8972e-05], device='cuda:0')
In [18]:
torch.Tensor(video).mean((1,2,3)).v
Out[18]:
tensor[24] x∈[0.453, 0.454] μ=0.453 σ=0.000
tensor([0.4535, 0.4535, 0.4535, 0.4535, 0.4535, 0.4535, 0.4535, 0.4535, 0.4534,
        0.4534, 0.4534, 0.4535, 0.4535, 0.4535, 0.4535, 0.4534, 0.4533, 0.4533,
        0.4533, 0.4533, 0.4533, 0.4532, 0.4532, 0.4532])
In [15]:
# Render the decoded frames as an inline animation
from utils.show_video import animate
animate(video)
[12/18/23 14:29:28] INFO     Animating tensor[24, 256, 256, 3] n=4718592 (18Mb) x∈[0., 1.000] μ=0.453 σ=0.341                                       show_video.py:animate:31
Out[15]:
In [ ]:
# Archive an HTML snapshot of this notebook (see backup path in the log output).
# (bug fix: removed the redundant duplicate `from torch_snippets import *` —
# backup_this_notebook is already in scope from the wildcard import in cell 1)
backup_this_notebook('stage-2-1.ipynb')
2023-12-18 14:39:47.201733: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-18 14:39:47.267271: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-18 14:39:48.219394: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
[12/18/23 14:39:49] INFO     Backing up this version of notebook to                                                                       ipython.py:backup_this_notebook:70
                             /home/ubuntu/code/AnimateAnyone-unofficial/backups/stage-2-1/stage-2-1__0000.html